* this script produces all the numbers and figures of Spamann et al., "The Reasons Highest Courts Give" as well as an Excel file containing all the processed data labelled in English
* tested on Stata 19
* if you open Stata by opening this file from its unzipped location, then Stata should automatically set that location as the main directory; if not, use cd to change to that.

clear all
cap mkdir graphs // this creates a folder where to store the graphs if it doesn't exist yet -- it will be a subdirectory to whatever the current directory is

program define labels
	label define janein 1 "Ja" 0 "Nein", replace 
	label define yesno 1 "Yes" 0 "No", replace
	label define country 0 "England" 1 "France" 2 "Germany", replace
	label define decade 0 "1880-1889" 1 "2007-2016", replace
	label define sample 0 "random" 1 "important", replace
	end
program scaling // takes as argument a varlist
	foreach var in `0' {
		if !inlist("`var'","words","dissent","firstinstance") qui replace `var' = `var'/words
	}
	end


***********************************************************************************
***********************************************************************************
**** Data Ingest ******************************************************************

* France -- already in right format/labeling
import excel F_table.xlsx, sheet("Sheet1") firstrow clear
drop if mi(country)
drop country

labels // label defs must be called AFTER the import command or else the latter overwrites them

gen byte country = "France":country

rename decade sample firstinstance dissent, proper
encode Decade, gen(decade) label(decade)
encode Sample, gen(sample) label(sample)
encode Firstinstance, gen(firstinstance) label(yesno)
encode Dissent, gen(dissent) label(yesno)
drop Decade Sample Firstinstance Dissent

compress
tempfile France
save `France'


* England and Germany

import excel "Master table.xlsx", sheet("Overview") firstrow clear // if you have extracted this Dataverse in one set, the table should reside in the same folder as this script; otherwise, change directory information as necessary
drop if mi(Name)
drop A
labels

gen byte country = "Germany":country if inlist(Gericht,"BGH","RG")
replace country = "England":country if mi(country)
label values country country

encode Zeitraum, gen(decade) label(decade)
label var decade decade

gen byte sample = Sample=="Wichtig"
label values sample sample

encode Vorinstanz, label(janein) gen(firstinstance)
label values firstinstance yesno

rename Gericht court
rename Name casename
rename Anzahl words 
rename Urteileinsgesamt cases_cited
rename VerwaufUrteileinsg cases_citations
rename VerwaufUrteilemitSV cases_facts
rename ZustimmungzuRspr cases_approving
rename BelegvonRauffassung cases_forsupport
rename StRsprAlsBelegvonRauffas cases_stRspr
rename AusdrVerwaufGründe cases_reasons
rename Gründewörtlichzitiert cases_quotereasons
rename Verwerfungoverruling cases_overrule
rename KritikohneVerwerfung cases_criticize
rename Abgrenzungdistinguishing cases_distinguish
rename VerweisaufandereAspekteeines cases_other
rename Sonderfälle cases_specialcase 
rename Mosaikbetrachtung cases_mosaic
rename Vorschrifteninsgesamt statutes_cited
rename Verweiseinsgesamt statutes_citations
rename BloßeVerweise statutes_mention
rename WörtlichesZitat statutes_quote
rename DiskussionAuslegung statutes_interpret
rename Literaturquelleninsgesamt lit_sources
rename AB lit_citations
rename LitdieRsprZustimmt lit_agreecaselaw
rename alsBeleg lit_support
rename ohneStellungnahme lit_nocomment
rename alsaA lit_opposingview
rename Diskussionzust lit_discuss_approve
rename Diskussionabl lit_discuss_reject
rename Gesetzgebungsmaterialien leghist

replace court="HL" if court=="UKHL"

encode AbweichendeVoten, label(janein) gen(dissent)
label values dissent yesno

drop Vorinstanz Zeitraum Sample Abweich

order country decade sample court casename
compress


* join all three countries and save

append using `France'
export excel using master_table_processed_incl_France.xlsx, firstrow(var) replace // this will overwrite the identically named file downloaded from Dataverse, if still in the same folder

*****************************************************************
*** checking structure of data / consistency of coding **********

count if statutes_citations<statutes_cited
assert r(N)==4 // explanation: statutes are counted by provision, whereas citations are counted by "block": e.g., "sections 1 and 3 of the X Law" is 2 statutes, 1 citation
assert statutes_citations == statutes_mention + statutes_quote + statutes_interpret

assert lit_citations>=lit_sources
assert lit_citations == lit_agreecaselaw + lit_support + lit_nocomment + lit_opposingview + lit_discuss_approve + lit_discuss_reject

assert cases_citations>=cases_cited
egen cases_total = rowtotal(cases_approving - cases_mosaic)
assert cases_total>=cases_citations // "mosaic" and reasons mentioned could be in addition to other citations
count if cases_total>cases_citations

correl statutes_citations statutes_cited
correl lit_citations lit_sources
correl cases_total cases_citations
sum statutes_citations statutes_cited lit_citations lit_sources cases_total cases_citations

drop cases_total


****************************************************************
*** generating helper macros
ds court casename country decade sample, not
global varlist `r(varlist)'

count if country==0 & decade==0
scalar N_g = r(N)


****************************************************************
****************************************************************
**** Graphs of distributions ***********************************

* generate summary variables & globals of variables to graph over (+ locals as their associated graph titles)
egen cases_summary = rowtotal(cases_approving cases_forsupport cases_stRspr)
egen cases_reasquo = rowtotal(cases_reasons cases_quotereasons)
egen cases_remainder = rowtotal(cases_criticize cases_other cases_specialcase)
egen lit_positive = rowtotal(lit_agreecaselaw lit_support lit_nocomment)

global corevars "words cases_cited statutes_cited lit_sources"
global casevars "cases_summary cases_facts cases_reasquo cases_overrule cases_distinguish cases_remainder"
global statvars "statutes_mention statutes_quote statutes_interpret leghist"
global litvars "lit_positive lit_opposingview lit_discuss_approve lit_discuss_reject"

local corevars "Length, and Number of Authorities Cited"
local casevars "Citations to Prior Cases"
local statvars "Citations to Statutes"
local litvars "Citations to Literature"

* variable labels for titles
label var words "Words"
label var cases_cited "Cases Cited"
label var statutes_cited "Statutes Cited"
label var lit_sources "Literary Sources Cited"

label var cases_summary "Summary Cites"
label var cases_facts "Facts Mentioned"
label var cases_reasquo "Reasons Mentioned"
label var cases_overrule "Overruled"
label var cases_distinguish "Distinguished"
label var cases_remainder "Other Non-Summary"

label var statutes_mention "Summary Cites"
label var statutes_quote "Quotes"
label var statutes_interpret "Interpretation"
label var leghist "Legislative History"

label var lit_positive "Positive or Comment-Less Cite"
label var lit_opposingview "Cited as Opposing View"
label var lit_discuss_approve "Discuss & Approve"
label var lit_discuss_reject "Discuss & Reject"



* graphs

foreach vars in corevars casevars statvars litvars {
	preserve // this enables dropping France as necessary -- adjust "inlist" command below as desired
	
	if inlist("`vars'","corevars","casevars","statvars","litvars") { // France stays in, so need different spacing etc
		if "`vars'" == "casevars" local w = 0.4
		else local w = 0.5
		local rank "_n + (N_g+1)*country + 3.5*N_g*decade"
		*if "`vars'"=="casevars" local xlabel `"xlabel(`=0.5*(N_g+1)' "E" `=1.5*(N_g+1)' `" "F" "1880-1889" "' `=2.5*(N_g+1)' "G" `=3.5*N_g+0.5*(N_g+1)' "E" `=3.5*N_g+1.5*(N_g+1)' `" "F" "2007-2016" "' `=3.5*N_g+2.5*(N_g+1)' "G", noticks)"' // abbreviate countries to one letter to save space
		local xlabel `"xlabel(`=0.5*(N_g+1)' "ENG" `=1.5*(N_g+1)' `" "FRA" "1880-1889" "' `=2.5*(N_g+1)' "GER" `=3.5*N_g+0.5*(N_g+1)' "ENG" `=3.5*N_g+1.5*(N_g+1)' `" "FRA" "2007-2016" "' `=3.5*N_g+2.5*(N_g+1)' "GER", noticks)"'
	} 
	else {
		drop if country=="France":country
		local w = 0.5
		local rank "_n + (N_g+1)*country/2 + 2.5*N_g*decade" // hack -- Germany is encoded as 2, so country/2 is 1 for Germany and remains zero for ENG
		local xlabel `"xlabel(`=0.5*(N_g+1)' "ENG" `=N_g+1' `" " " "1880-1889" "' `=1.5*(N_g+1)' "GER" `=2.5*N_g+0.5*(N_g+1)' "ENG" `=2.5*N_g+N_g+1' `" " " "2007-2016" "' `=2.5*N_g+1.5*(N_g+1)' "GER", noticks)"'
	}
	
foreach var of varlist $`vars' {
	bys country decade (`var'): gen rank = `rank'
	tsset rank
	tsfill // this is necessary to prevent the line graph from connecting the groups' means
	bys country decade: egen mu = mean(`var')
	twoway spike `var' rank if country==0 & sample==1, color("0 114 178") lwidth(`w') /// colors recommended by plotplainblind
		|| spike `var' rank if country==1 & sample==1, color("0 158 115") lwidth (`w') /// note that France won't be drawn if dropped for this graph
		|| spike `var' rank if country==2 & sample==1, color("230 159 0") lwidth (`w') ///
		|| spike `var' rank if country==0 & sample==0, color("0 114 178%60") lwidth(`w') ///
		|| spike `var' rank if country==1 & sample==0, color("0 158 115%60") lwidth(`w') ///
		|| spike `var' rank if country==2 & sample==0, color("230 159 0%60") lwidth(`w') ///
		|| line mu rank, sort(rank) cmissing(n) lcolor(black) ///
		|| , ylabel(minmax) yscale(r(0)) ytitle("") `xlabel' xtitle("") ///
		legend(off) title("`: var label `var''") scheme(s1color) name(`var', replace)
	drop if mi(`var') // get rid of the tsfill obs
	drop rank mu
}

graph combine $`vars', scheme(s1color) title("``vars''") saving(graphs/distribution3_`vars', replace) ///
	caption("Graphs show the distribution of feature counts over opinions by jurisdiction and decade as indicated in the graphs." ///
	"Each vertical bar represents one opinion. The height of the bar represents the feature count for that opinion." ///
	"Within each jurisdiction-decade, opinions are displayed in ascending order of the respective feature, with a horizontal bar representing their mean." ///
	"Randomly selected opinions are drawn in a lighter shade than cases selected for their importance.", ///
	size(vsmall))
graph export graphs/distribution3_`vars'.emf, replace
restore // gets France back in if dropped at beginning of loop
}



********************************************************************
********************************************************************
*** Testing and displaying differences by groups *******************

local collinear "cases_citations lit_citations statutes_citations" // the latter two are sums of other variables; the first almost so
local headervars "words dissent firstinstance" // these are on different scales, arguably different in kind
local testlist_all: list global(varlist) - collinear
local testlist_main: list testlist_all - headervars


*******************************************
*** multivariate tests if groups matter at all ***

tempname manovaresults
frame create `manovaresults' byte(scaled interacted) str13 testlist float(sample country decade countrydecade)
forvalues scaled=0/1 { // we do it once for raw counts, and once scaled by number of words in opinion
forvalues interacted=0/1 {
foreach testlist in all main {
	local t "`testlist_`testlist''" // for readability
	preserve // to revert to unscaled data later, if applicable
	if `scaled' scaling `t'
	if `interacted'	{
		qui manova `t' = sample country##decade
		scalar p4 = e(stat_4)["Pillai","pvalue"]
	}
	else {
		qui manova `t' = sample country  decade
		scalar p4 = .
	}				
	frame post `manovaresults' (`scaled') (`interacted') ("`testlist'") ///
		(e(stat_1)["Pillai","pvalue"]) (e(stat_2)["Pillai","pvalue"]) (e(stat_3)["Pillai","pvalue"]) (p4)
	restore
}
}
}

frame change `manovaresults'
format sample country decade countrydecade %7.5f
list scaled interacted testlist sample country decade countrydecade
frame change default
frame drop `manovaresults'


**********************************************
*** clustering ********************************

local testlist: list global(varlist) - headervars // if I define this before the manova loop that uses "testlist", this macro definition gets lost!

gen byte England=country=="England":country
gen byte Germany=country=="Germany":country
foreach method in means medians {
	forvalues decade=0/1 {
		cluster k`method' `testlist' if decade==`decade', k(2) gen(family_`method'_`decade')
		tab Germany family_`method'_`decade', exact
	}
}

foreach var of varlist `testlist' {
	gen `var'_scaled = `var'/words
	}
	
foreach method in means medians {
	forvalues decade=0/1 {
		cluster k`method' *_scaled if decade==`decade', k(2) gen(family_`method'_scaled_`decade')
		tab Germany family_`method'_scaled_`decade', exact
	}
}
	
drop *_scaled England Germany family_*

**********************************************
*** Principal Component plots ****************

foreach var of varlist `testlist' {
	gen `var'_scaled = `var'/words
	}
decode country, gen(marker)
replace marker = substr(marker,1,1)
	
pca $varlist
scoreplot, msymbol(none) mlabel(marker) name(unscaled) title("Unscaled features")
pca *_scaled
scoreplot, msymbol(none) mlabel(marker) name(scaled) title("Scaled features") 
graph combine unscaled scaled, title("Scores for First Two Principal Components") ///
	caption("Plots of the first two principal component scores for each opinion. E, F, and G mark opinions from England, France, and Germany, respectively." ///
	"The components and scores are calculated from all features listed in Table 1 except the first three (words, dissent, first instance)." ///
	"The left plot uses the raw features. The right plot first divides all features by the opinion's number of words.", size(vsmall) justification(center)) ///
	saving(graphs/PCA, replace)
graph export graphs/PCA.emf, replace
drop *_scaled

**********************************************
*** calculating & testing distances ********

mat c = I(_N)-(J(_N,1,1)#J(1,_N,1/_N)) // right-multiplying by c centers the data, i.e., subtracts out the mean

program distances, rclass // calculates average distances between courts within decade, and between common (E) and civil (F, G) law
	sort decade country sample
	mkmat words, matrix(w)
	mkmat `0', matrix(X)
	forvalues scaled=0/1 {
		if `scaled' mat X = invsym(diag(w))*X // this divides each entry by the number of words in the opinion. CAREFUL: changes X
		mat cX = c*X // centers X
		mat Cinv = invsym(cX'*cX*(1/(_N-1))) // inverse covariance matrix of X
		forvalues decade=0/1 {
			forvalues c1=0/2 { // country 1
				forvalues c2=`=`c1'+1'/2 { // country 2
					local E`decade'`c1'`c2' = 0 // initialize Euclidean distance
					local M`decade'`c1'`c2' = 0 // initialize Mahalanobis distance
					forvalues sample=0/1 { // sample
						forvalues i1=1/10 { // loop through all opinions in the country-decade-sample set for country 1 ...
							local r1 = `i1' + 10*`sample' + 20*`c1' + 60*`decade'
							forvalues i2=1/10 { // ... and country 2
								local r2 = `i2' + 10*`sample' + 20*`c2' + 60*`decade'
								matrix dx = X[`r1',1...] - X[`r2',1...] // can't use name "d" because use that above (for diff in means)
								mat E = dx*dx'
								mat M = dx*Cinv*dx'
								local E`decade'`c1'`c2' = `E`decade'`c1'`c2'' + sqrt(E[1,1])
								local M`decade'`c1'`c2' = `M`decade'`c1'`c2'' + sqrt(M[1,1])
							}
						}
					}
					local E`decade'`c1'`c2' = `E`decade'`c1'`c2'' / 200 // dividing by (2 samples) * (10*10 distances)
					local M`decade'`c1'`c2' = `M`decade'`c1'`c2'' / 200
				}
			}
		}
		foreach d in E M {
			mat `d'`scaled' = ``d'002', ``d'001', ``d'012', `=(``d'002'+``d'001')/2' \ ``d'102', ``d'101', ``d'112', `=(``d'102'+``d'101')/2'
			mat `d'`scaled' = `d'`scaled' \ `d'`scaled'/``d'012' // normalizing by F-G distance 1880s
			mat colnames `d'`scaled' = E_G_`d'`scaled' F_E_`d'`scaled' G_F_`d'`scaled' comciv_`d'`scaled'
			mat rownames `d'`scaled' = 1880 2010 1880n 2010n
			mat `d'`scaled'_cc = `d'`scaled'[1...,4] // this is com (E) vs. civ (F, G) 
			mat `d'`scaled' = `d'`scaled'[1...,1..3]
		}
	}
	matrix dist = E0 , M0 , E1 , M1, E0_cc, M0_cc, E1_cc, M1_cc
	return matrix dist = dist
	end

program ddist, eclass // d for differences in distances between 1880s and 2010s
	distances `0'
	tempname d
	matrix `d' = r(dist)
	matrix `d' = `d'[2,1...] - `d'[1,1...]
	forvalues m=1/4 { // adding change in distance com-civ relative to change in distance F-G, for each of the four distance metrics
		matrix `d' = `d', `d'[1,`=12+`m'']/`d'[1,`=`m'*3']
	}
	ereturn post `d'
	end
	
program idist, eclass // i for individual distances -- it's a crutch to get the results of distances into the e(b) macro, which permute can understand
	syntax varlist, decade(integer)
	local r = `decade' + 1
	distances `varlist'
	tempname d
	matrix `d' = r(dist)
	matrix `d' = `d'[`r',1...]
	ereturn post `d'
	end


distances `testlist'
mat distances = r(dist)
mat list distances

bootstrap _b, strata(country decade sample) reps(1000) dots(10): ddist `testlist'
mat bootstrapp = r(table)
mat bootstrapp = bootstrapp["pvalue",1...]
mat bootstrapp = bootstrapp[1,1..16] \ J(1,12,.), bootstrapp[1,17..20]
mat rownames bootstrapp = p_diff p_diff_rel

tempname p0 p1 cc
gen byte permset = . // to compare two countries, we will mark one as left out -- we don't want to touch its designation
forvalues decade=0/1 {
	foreach leftout in 1 2 0 { // first leave out F to get test between E (1st column) and G, then leave out G to get test between F (2nd col) and E, etc.
		di "now permuting countries that are not `: label country `leftout'' for decade `: label decade `decade''"
		qui replace permset = country!=`leftout'
		permute country _b, reps(1000) strata(permset decade sample) nodots nowarn noheader nolegend: idist `testlist', decade(`decade')
		mat `p`decade'' = nullmat(`p`decade'') \ r(p_twosided)
	}

	local c = colsof(`p`decade'')
	local r = rowsof(`p`decade'')
	forvalues i=1/`=`c'/`r'' { 
		local begin = 1+(`i'-1)*`r'
		local end = `i'*`r'
		mat `p`decade''[1,`begin'] = vecdiag(`p`decade''[1...,`begin'..`end'])
	}
	mat `p`decade'' = `p`decade''[1,1..12] // keep the first rows only, which is where we just moved all the relevant p-values, and only for country comparisons
	
	di "now permuting common v civil law, for decade `: label decade `decade''" // permuting all countries = implicitly permuting common (E) vs. civil (F, G)
	permute country _b, reps(1000) strata(decade sample) nodots nowarn noheader nolegend: idist `testlist', decade(`decade')
	mat `cc' = r(p_twosided)
	mat `p`decade'' = `p`decade'', `cc'[1,13...] 
}

mat p = `p0' \ `p1' 
mat rownames p = p1880 p2010
mat distances = distances \ p \ bootstrapp

preserve
clear
svmat distances, names(col)
local rownames: rownames distances
local N: rowsof distances
gen str5 var = ""
forvalues i=1/`N' {
	qui replace var = "`:word `i' of `rownames''" in `i'
}
order var
export excel using distances.xlsx, firstrow(variables) sheet("Sheet1", modify) keepcellfmt // "modify" preserves some Excel functions to facilitate copy&paste
restore
*/

******************************************
*** variable by variable --> output to table *****************

foreach comparison in raw scaled { // we do it once for raw counts, and once scaled by number of words in opinion
	preserve // to revert to unscaled data later

	* scaling, if applicable
	if "`comparison'" == "scaled" {
		foreach var of global varlist {
			if !inlist("`var'","words","dissent","firstinstance") replace `var' = 100000*`var'/words // per 100,000 words
		}
	}

	* create frame to hold test results
	tempname tests
	frame create `tests' str20(var) byte(decade country) p

	* pairwise tests
	foreach var of global varlist {
		forvalues decade=0/1 { // country vs. country (within decade)
			forvalues leftout=0/2 {
				qui: ranksum `var' if decade==`decade' & country!=`leftout', by(country) exact
				frame post `tests' ("`var'") (`decade') (`leftout') (r(p_exact)) // NB: country saved is leftout -- need to adjust below
			}
		}
		forvalues country=0/2 { // decade vs. decade (within country)
			qui: ranksum `var' if country==`country', by(decade) exact
			frame post `tests' ("`var'") (2) (`country') (r(p_exact)) // saving this by "pseudo-decade"=2
		}
	}

	* Benjamini-Hochberg (5% FDR)
	frame change `tests'
	recode country (0 = 2) (1 = 0) (2 = 1) if decade!=2 // this shifts from the "leftout" country to the "right" country in the test
	bys decade country (p): gen byte BHpass = p<0.05*_n/_N
	by  decade country: egen BHcutoff = max(p*BHpass)
	list decade country BHcutoff if mod(_n,_N/9)==1
	recode BHpass (0=1) if p<BHcutoff
	drop p BHcutoff

	* saving in a shape to merge into table
	reshape wide BHpass, i(var decade) j(country)
	tempfile BHpasses
	save `BHpasses'

	* getting the means
	frame change default
	collapse (mean) $varlist, by(decade country)
	replace dissent = dissent*100 // because we show this as a percentage
	replace firstinstance = firstinstance*100 // id.
	rename ($varlist) v#, addnumber // in order to use the reshape command -- see recovery two lines down
	reshape long v, i(decade country) j(counter)
	gen str20 var = "" // from here through the foreach loop, I recover the original variable names, i.e., I undo the rename two lines above
	local counter = 1
	foreach var of global varlist { 
		qui replace var = "`var'" if counter==`counter++'
	}
	drop counter
	replace v = round(v)
	tostring v, replace
	replace v = v + "%" if inlist(var,"dissent","firstinstance") // because we show these two as percentages
	reshape wide v, i(decade var) j(country)
	rename (v0 v1 v2) (ENG FRA GER)

	* merge in the BHpasses
	* for country-vs-country tests
	merge 1:1 decade var using `BHpasses', nogenerate keep(3) assert(2 3)
	rename BHpass# cdiff# // this allows referencing cdiff* flexibly for formatting. Alternatively, attach stars using replace x = x + "*" if cdiff[x]
	* for decade-vs-decade tests
	recode decade (1=2) // temporary crutch for the merger -- recall we saved these tests as "pseudo-decade"=2 -- reversed two lines down
	merge 1:1 decade var using `BHpasses', nogenerate keep(3) assert(2 3) // in main data, decades are now 0 and 2 --> both matched. Decade 1 from using isn't.
	recode decade (2=1)
	recode BHpass* (nonmissing = .) if decade==0
	rename BHpass# tdiff#

	* sort for presentation in table and export .xlsx
	local order "words dissent firstinstance cases_cited cases_citations cases_approving cases_forsupport cases_stRspr cases_reasons cases_quotereasons cases_overrule cases_criticize cases_distinguish cases_other cases_specialcase cases_mosaic cases_facts statutes_cited statutes_citations statutes_mention statutes_quote statutes_interpret leghist lit_sources lit_citations lit_agreecaselaw lit_support lit_nocomment lit_opposingview lit_discuss_approve lit_discuss_reject"
	gen byte sortorder = .
	local i = 1
	foreach r in `order' {
		qui replace sortorder = `i++' if var == "`r'"
	}
	sort decade sortorder
	drop sortorder
	export excel using T1`comparison'.xlsx, firstrow(variables) sheet("Sheet1", modify) keepcellfmt // in case it gets overwritten in spite of keepcellfmt: in Excel, apply formatting to C2:E63 using rule "=OFFSET(C2,0,x)=1", x=3 or 6 !

	frame drop `tests'

	restore
}